*** NOTE: SET PATH TO JOP Replication files 
*** GOAL: merge village level data with DID data for matching 

* Set working directory to "JOP Replication files" folder on your computer 

** NOTE: Run line 8-227
** Run CODE/GRAPHS/matching.R and then run line 228 onwards 

* ---------------------------------------------------------------------------
* 1. IHDS I village file -> TEMP_FILES/vill_05.dta
*    Keeps the village identifiers plus migration / infrastructure items,
*    gives the survey items readable names, builds a string village key
*    (uid) and stamps the round with year = 2005.
* ---------------------------------------------------------------------------
use "DATA FILES TO SHARE/IHDS_RAW/22626-0007-Data.dta", clear

* Identifiers and village descriptors are kept under their original names;
* only the VE* / VI* survey items are renamed below.
keep STATEID DISTID PSUID STATE DIST TEHSIL VILL POPCAT TOTHH AREA ///
     VE9 VE91A VE92A VE91B VE92B VE91C VE92C                       ///
     VI1B VI2B VI3A VI3B VI4A VI4C VI6A VI11B

* NOTE(review): VE92C becomes mig_ind4_loc (not mig_ind3_loc). The same name
* is used for the IHDS II file, so it is left as is - confirm before changing.
rename (VE9     VE91A         VE92A        VE91B         VE92B        ///
        VE91C         VE92C        VI1B      VI2B          VI3A      ///
        VI3B       VI4A      VI4C          VI6A     VI11B)           ///
       (mig_pop mig_ind1_dist mig_ind1_loc mig_ind2_dist mig_ind2_loc ///
        mig_ind3_dist mig_ind4_loc dist_town dist_district vill_road ///
        dist_pucca vill_elec vill_elec_hrs vill_mob vill_bus)

* 1a. Village key.
* Each component is zero-padded to two digits before concatenation; without
* the padding, state 3 + district 11 and state 31 + district 1 would produce
* the same key.
* NOTE(review): this key is built from STATE / DIST, whereas the women's file
* later builds uid from STATEID / DISTID - confirm the two code pairs agree.
generate str6 uid = string(STATE, "%02.0f") + string(DIST, "%02.0f") + string(PSUID, "%02.0f")

* Duplicate check: dup = 0 for a unique uid, otherwise 1, 2, ... within uid.
* Author's note: no duplicates were found in this round.
quietly bysort uid: generate dup = cond(_N == 1, 0, _n)
tabulate dup

* Round marker.
generate year = 2005

* Author's note: 1501 villages in this file.
save "DATA FILES TO SHARE/TEMP_FILES/vill_05.dta", replace
clear


* ---------------------------------------------------------------------------
* 2. IHDS II village file -> TEMP_FILES/vill_12.dta
*    Same recipe as the IHDS I step: keep, rename, build uid, drop villages
*    whose uid is not unique, stamp the round with year = 2012.
* ---------------------------------------------------------------------------
use "DATA FILES TO SHARE/IHDS_RAW/36151-0012-Data.dta", clear

keep STATEID DISTID PSUID STATE DISTA VILL                ///
     VE9F VE10 VE10A1 VE10A2 VE10B1 VE10B2 VE10C1 VE10C2 VE11 ///
     VI1B VI2B VI3 VI3A VI4A VI4D VI6 VI13B

* DISTA is renamed to DIST so the key below uses the same variable names as
* the IHDS I file; the remaining identifiers keep their names.
* NOTE(review): VE10C2 becomes mig_ind4_loc (not mig_ind3_loc), mirroring the
* IHDS I naming - confirm before changing.
rename (DISTA VE9F      VE10    VE10A1        VE10A2       VE10B1        ///
        VE10B2       VE10C1        VE10C2       VE11   VI1B      VI2B   ///
        VI3       VI3A       VI4A      VI4D          VI6      VI13B)    ///
       (DIST  mig_daily mig_pop mig_ind1_dist mig_ind1_loc mig_ind2_dist ///
        mig_ind2_loc mig_ind3_dist mig_ind4_loc mig_hh dist_town dist_district ///
        vill_road dist_pucca vill_elec vill_elec_hrs vill_mob vill_bus)

* Village key: zero-pad every component to two digits so that, e.g.,
* state 3 + district 11 cannot collide with state 31 + district 1.
generate str6 uid = string(STATE, "%02.0f") + string(DIST, "%02.0f") + string(PSUID, "%02.0f")

* Duplicate check: dup = 0 for a unique uid, otherwise 1, 2, ... within uid.
quietly bysort uid: generate dup = cond(_N == 1, 0, _n)
tabulate dup

* Author's note: 2 duplicates appear here (same 2001 district code).
* Every observation that shares its uid with another one is removed.
drop if dup > 0

* Round marker.
generate year = 2012

* Author's note: 1408 villages in this file.
save "DATA FILES TO SHARE/TEMP_FILES/vill_12.dta", replace
 
 
 * ---------------------------------------------------------------------------
 * 3. Stack the two village rounds -> TEMP_FILES/vill_0512.dta
 *    Only villages whose uid occurs more than once in the stacked file
 *    (i.e. in both rounds, since each round is unique on uid) are kept.
 * ---------------------------------------------------------------------------
use "DATA FILES TO SHARE/TEMP_FILES/vill_05.dta", clear
append using "DATA FILES TO SHARE/TEMP_FILES/vill_12.dta"

* dup2 = 0 when the uid appears once, otherwise 1, 2, ... within uid.
quietly bysort uid: generate dup2 = cond(_N == 1, 0, _n)
tabulate dup2

* Author's note: 1404 villages match across the two rounds (only 1408 were
* re-interviewed in the second round to begin with).
* Villages seen in a single round carry no panel information - drop them.
keep if dup2 != 0
save "DATA FILES TO SHARE/TEMP_FILES/vill_0512.dta", replace


*******************************************
************ 4. MERGING THE VILLAGE AND EW DATA ************
************ Note: the EW (eligible women) data only includes women with
************ non-migrant husbands in round 1, so not every EW is in it.
*******************************************

* ---------------------------------------------------------------------------
* 4.1 Load the women-level panel and build the village key
* ---------------------------------------------------------------------------
use "DATA FILES TO SHARE/main_ew_hh_df.dta", clear

* Same zero-padded key layout as the village files so the two can be merged.
* NOTE(review): built from STATEID / DISTID here, while the village files use
* STATE / DIST - confirm the two code pairs agree.
generate str6 uid = string(STATEID, "%02.0f") + string(DISTID, "%02.0f") + string(PSUID, "%02.0f")

* Number of distinct PSUs = max of x.
* Author's note: 2358 distinct villages / urban areas.
egen x = group(uid)
summarize x

* ---------------------------------------------------------------------------
* 4.2 Second-round observations + IHDS II village data
*     (year == 1; the author labels this step "keep only 2011")
* ---------------------------------------------------------------------------
drop if year != 1

merge m:1 uid using "DATA FILES TO SHARE/TEMP_FILES/vill_12.dta"

* Keep only women whose PSU has a village record (and vice versa).
drop if _merge != 3

* Recount the distinct PSUs that survive the merge.
drop x
egen x = group(uid)
summarize x

save "DATA FILES TO SHARE/TEMP_FILES/vill_ew_12.dta", replace

* Author's note: 1361 unique ids, i.e. eligible women from 1361 villages.
* 47 villages are lost in the merge (per the IHDS documents there are 1420
* villages, of which village data was available for 1408).

*******************************************************
**************** 4.3. Repeat for IHDS I
* Reload the women-level panel from scratch and rebuild the village key.
use "DATA FILES TO SHARE/main_ew_hh_df.dta", clear

* Same zero-padded key layout as the village files so the two can be merged.
* NOTE(review): built from STATEID / DISTID here, while the village files use
* STATE / DIST - confirm the two code pairs agree.
generate str6 uid = string(STATEID, "%02.0f") + string(DISTID, "%02.0f") + string(PSUID, "%02.0f")

* Number of distinct PSUs = max of x.
* Author's note: 2358 distinct villages / urban areas.
egen x = group(uid)
summarize x

*******************************************************
**************** 4.4. keep only 2005 (year == 0) and attach IHDS I village data
drop if year != 0
merge m:1 uid using "DATA FILES TO SHARE/TEMP_FILES/vill_05.dta"

* Keep only women whose PSU has a village record (and vice versa).
drop if _merge != 3

* Recount the distinct PSUs that survive the merge.
drop x
egen x = group(uid)
summarize x

save "DATA FILES TO SHARE/TEMP_FILES/vill_ew_05.dta", replace
* Author's note as recorded: "1433 unique ids - data on eligible women from
* 1430 villages". The two figures disagree; re-check against the -summarize-
* output above.


 *************************************************
 *************** 5. Append village + EW data
 * Goal: keep only women living in PSUs (uid) that are present in BOTH
 * vill_ew_05.dta and vill_ew_12.dta, then stack the two rounds.
 *
 * The uid key files hold one row per woman, so uid is NOT unique in them.
 * Joining them with -merge m:m- pairs rows by position within each uid and
 * repeats the last row of the shorter group, which fabricates duplicate
 * women whenever a PSU has a different number of women in the two rounds.
 * The key files are therefore reduced to one row per uid and joined m:1,
 * which is a pure "uid exists in the other round" filter.
 *
 * Files written:
 *   uid_05.dta, uid_12.dta                 - as before (one row per woman)
 *   uid_05_unique.dta, uid_12_unique.dta   - new, one row per uid
 *   vill_ew_unique12.dta, vill_ew_merge.dta

* 5a. PSU keys present in the 2005 women + village file
use "DATA FILES TO SHARE/TEMP_FILES/vill_ew_05.dta", clear
keep uid
save "DATA FILES TO SHARE/TEMP_FILES/uid_05.dta", replace
duplicates drop
save "DATA FILES TO SHARE/TEMP_FILES/uid_05_unique.dta", replace

* 5b. 2012 women restricted to PSUs that also appear in 2005
use "DATA FILES TO SHARE/TEMP_FILES/vill_ew_12.dta", clear
drop _merge
merge m:1 uid using "DATA FILES TO SHARE/TEMP_FILES/uid_05_unique.dta"
keep if _merge == 3
save "DATA FILES TO SHARE/TEMP_FILES/vill_ew_unique12.dta", replace

* PSU keys that survive in 2012 (by construction they also exist in 2005)
keep uid
save "DATA FILES TO SHARE/TEMP_FILES/uid_12.dta", replace
duplicates drop
save "DATA FILES TO SHARE/TEMP_FILES/uid_12_unique.dta", replace

* 5c. 2005 women restricted to the same PSUs
use "DATA FILES TO SHARE/TEMP_FILES/vill_ew_05.dta", clear
drop _merge
merge m:1 uid using "DATA FILES TO SHARE/TEMP_FILES/uid_12_unique.dta"
keep if _merge == 3

* 5d. Stack both rounds and recount the distinct PSUs (max of x)
append using "DATA FILES TO SHARE/TEMP_FILES/vill_ew_unique12.dta"
drop x
egen x = group(uid)
summarize x
save "DATA FILES TO SHARE/TEMP_FILES/vill_ew_merge.dta", replace


* ---------------------------------------------------------------------------
* Person-level key, duplicate check, and export of the matching input
* (TEMP_FILES/match_ew_vill.dta).
* ---------------------------------------------------------------------------

* Person key: the six identifying codes, each zero-padded to two digits.
generate str12 person_uid =                                            ///
      string(STATEID, "%02.0f")       + string(DISTID, "%02.0f")       ///
    + string(PSUID, "%02.0f")         + string(HHID2005, "%02.0f")     ///
    + string(HHSPLITID2005, "%02.0f") + string(PERSONID2005, "%02.0f")

* Number of distinct persons = max of x.
drop x
egen x = group(person_uid)
summarize x

* Replace the village-level dup flag carried in from the village files with a
* person-level one: 0 for a unique person_uid, otherwise 1, 2, ... within it.
drop dup
quietly bysort person_uid: generate dup = cond(_N == 1, 0, _n)
tabulate dup

* Author's note: no duplicates in the data.

* Matching is done on the DID sample, using first-round (year == 0) rows only.
keep if did_sample == 1 & year == 0

* Disabled step kept for reference:
*do "/Users/rithika/Dropbox/Dissertation/JOP Replication files/CODE/DATA/CODE/ihds_changesforrg_didindex.do"

save "DATA FILES TO SHARE/TEMP_FILES/match_ew_vill.dta", replace

*** Now take this to R and do the PSM Matching **** 

** it is done in JOP Replication files/CODE/GRAPHS/matching.R 
*** RUN CODE/GRAPHS/matching.R 



* ---------------------------------------------------------------------------
* Post-matching step (run after the R matching script has produced
* TEMP_FILES/match_w1.dta).
* Takes the person identifiers of the matched wave-1 sample, pulls both
* rounds of those women from the main EW data, and saves the reduced panel
* used for the DID on the matched subset.
* ---------------------------------------------------------------------------
use "DATA FILES TO SHARE/TEMP_FILES/match_w1.dta", clear

* Only the identifying codes are needed from the matched file.
* Author's note: 4062 observations in the matched wave-1 dataset.
keep STATEID DISTID PSUID HHID2005 HHSPLITID2005 PERSONID2005

* One matched woman (master) -> her rows in the main panel (using).
merge 1:m STATEID DISTID PSUID HHID2005 HHSPLITID2005 PERSONID2005 ///
    using "DATA FILES TO SHARE/main_ew_hh_df.dta", generate(match_merge)

/*
    Merge result recorded by the author:

    Result                      Number of obs
    -----------------------------------------
    Not matched                        41,614
        from master                         0  (match_merge==1)
        from using                     41,614  (match_merge==2)

    Matched                             8,124  (match_merge==3)
    -----------------------------------------
*/

* Discard women who are not part of the matched sample.
drop if match_merge != 3

* Panel check: every woman should appear exactly twice (once per round).
generate str12 person_uid =                                            ///
      string(STATEID, "%02.0f")       + string(DISTID, "%02.0f")       ///
    + string(PSUID, "%02.0f")         + string(HHID2005, "%02.0f")     ///
    + string(HHSPLITID2005, "%02.0f") + string(PERSONID2005, "%02.0f")

* Number of distinct women = max of x.
egen x = group(person_uid)
summarize x

* dup2 = 0 for a person seen once, otherwise 1, 2, ... within person_uid.
quietly bysort person_uid: generate dup2 = cond(_N == 1, 0, _n)
tabulate dup2

/*
    Tabulation recorded by the author (each individual has 2 observations):

       dup2 |      Freq.     Percent        Cum.
------------+-----------------------------------
          1 |      4,062       50.00       50.00
          2 |      4,062       50.00      100.00
------------+-----------------------------------
      Total |      8,124      100.00
*/

* Retain only the variables needed downstream.
keep mig_husb year w2_abshusband_dummy     ///
     pol_score_norm dec_score_norm mob_score_norm ///
     GR_hh_cash vill_id_num hhuid_num

save "DATA FILES TO SHARE/match_w1.dta", replace



